#!/usr/bin/env python
# -*- coding:utf-8 -*-

import os

import StringIO
import base64
import sys
import json
import time
import datetime
import requests
import traceback
from config import glob_conf
import hashlib
from cassandra.cluster import Cluster
from pymongo import MongoClient
from scpy.logger import get_logger
from scpy.xawesome_codechecker import timeit

from lib.OCR_CQ import ocr_cq
from lib.OCR_SC import ocr_sc
from lib.OCR_BJ import ocr_bj
from lib.OCR_TJ import ocr_tj
from lib.OCR_BJ1 import ocr_bj1
from lib.OCR_Chinese import ocr_cn
from lib.OCR_JS import ocr_js
from lib.OCR_NMG import ocr_nmg
from lib.OCR_YN import ocr_yn
from lib.OCR_LN import ocr_ln
from lib.OCR_XJ import ocr_xj
from lib.OCR_GD import ocr_gd
from lib.OCR_SD import ocr_sd
from lib.OCR_JL import ocr_jl
from lib.OCR_HB import ocr_hb
from lib.OCR_JX import ocr_jx
from lib.OCR_SAX import ocr_sax
from lib.OCR_SX import ocr_sx
from lib.OCR_XZ import ocr_xz
from lib.OCR_QH import ocr_qh
from lib.OCR_GZ import ocr_gz
from lib.OCR_HAIN import ocr_hain


# Dispatch table: province code -> local captcha OCR solver.
# Provinces sharing the generic Chinese-character captcha all map to ocr_cn.
CAP_DICT = {
    'ah':ocr_cn,
    'hlj':ocr_cn,
    'hen':ocr_cn,
    'gx':ocr_cn,
    'cq':ocr_cq,
    'sc':ocr_sc,
    'tj':ocr_tj,
    'bj':ocr_bj,
    'bj1':ocr_bj1,
    'nmg':ocr_nmg,
    'js':ocr_js,
    'ln':ocr_ln,
    'yn':ocr_yn,
    'xj':ocr_xj,
    'gd':ocr_gd,
    'sd':ocr_sd,
    'jl':ocr_jl,
    'hb':ocr_hb,
    'jx':ocr_jx,
    'sax':ocr_sax,
    'gz':ocr_gz,
    'sx':ocr_sx,
    'xz':ocr_xz,
    'qh':ocr_qh,
    'hain':ocr_hain,
}


# Province codes; matches the ocr_cn entries of CAP_DICT (plus 'sc'/'tj'),
# but this list is not referenced in this chunk — presumably used by the
# rest of the file/project. TODO confirm against callers.
PROVINCE_NAME = ['ah','hlj','hen','cq','sc','tj','gx']

# Provinces whose captchas are sent to the remote captcha service
# (kill_captcha_from_service) instead of a local CAP_DICT solver.
CAP_SERVICE_PROVINCE = ['sd', 'jl', 'yn', 'js']

logger = get_logger(__file__)
# Python 2 only: re-expose setdefaultencoding so implicit str<->unicode
# conversions use UTF-8 instead of ascii.
reload(sys)
sys.setdefaultencoding("utf-8")
# Shared MongoDB client for all helpers below.
MONGO_CONN = MongoClient(glob_conf.MONGO_HOST)


# Shared Cassandra session bound to the 'crawler' keyspace.
cluster = Cluster(glob_conf.CAS_HOST)
session = cluster.connect('crawler')


# Mongo database / collection names used by the save_* helpers.
DB_NAME = 'crawler_log'
COLLECTION_ID = 'saic_ids_log'
COLLECTION_COMPANY= 'saic_company_name'
COLLECTION_URL = 'saic_company_url'

# 解析出错的collection (collection for records that failed parsing)
COLLECTION_COMPANY_ERROR = 'saic_company_name_error'

def kill_captcha_from_service(data, source,format):
    """
    解析验证码服务
    :param data: 验证码内容（二进制数据）
    :param source: 来源（cq/sc/sh/...）
    :param format: 验证码格式（jpg/png/...）
    :return: 识别结果
    """
    start = time.time()
    post_data = {
        'data': base64.b64encode(data),
        'source': source,
        'format': format
    }
    try:
        result = requests.post(glob_conf.CAPTCHA_URL, data=post_data).content
        logger.info('captcha [%s-%s]' % (result, source))
        return result
    except Exception, e:
        logger.exception(e)
        return None


def insert_into_mongo(collection, filter_data, data):
    if not collection or not data:
        return False
    if not isinstance(data, dict):
        return False
    try:
        collection = MONGO_CONN[DB_NAME][collection]
        collection.update_one(filter_data, {'$set': data}, True)
        return True
    except Exception, e:
        logger.exception(e)
        return False


def save_data_cassandra(data_tuple,reg_no):
    """
    Persist one successfully-parsed company record.

    Writes the cleaned data (plus its raw source and a SHA-256 hash) to the
    Cassandra table saic_by_name, upserts the company name into Mongo, and
    bumps the per-province crawl counter.

    :param data_tuple: (raw_source, clean_data[, url_data]) — only the first
        two items are used here; clean_data is a dict with 'basicList'/'province'
    :param reg_no: registration number, used only in the failure log message
    """
    today = datetime.datetime.today().strftime('%Y-%m-%d')
    # need at least (raw, clean); anything shorter carries no parsed data
    if not data_tuple or len(data_tuple)<2:
        return
    clean_data = data_tuple[1]
    if not clean_data:
        return

    # basicList[0] holds the company's core registration fields
    basic_list = clean_data.get('basicList')
    basic_dict = basic_list[0] if basic_list else {}

    company_name = basic_dict.get('enterpriseName','')
    reg_no_data = basic_dict.get('regNo','')
    province = clean_data.get('province','')
    # a record without a company name cannot be keyed — skip it
    if not company_name:
        logger.info('reg_no:%s no name'%reg_no_data)
        return
    data_str = json.dumps(clean_data)
    # hash of the serialized clean data, for change detection downstream
    data_hash = hashlib.sha256(data_str).hexdigest()
    source = json.dumps(data_tuple[0])
    session.execute('INSERT INTO saic_by_name (company_name, reg_no, date, province, data, data_hash, source) '
                    'values (%s,%s,%s,%s,%s,%s,%s)',
                    (company_name,reg_no_data,today,province,data_str,data_hash,source))

    logger.info("done"*100)
    # mirror the company name into Mongo so crawlers can look it up by name
    company_data = {}
    company_data['_id']=company_name
    company_data['companyName'] = company_name
    company_data['province'] = province
    company_data['updateTime'] = datetime.datetime.utcnow()
    insert_into_mongo(COLLECTION_COMPANY,{'_id':company_data['_id']},company_data)

    # save crawler log to cassandra
    log_cassandra(province)
    return


def save_source_parse_error(data_tuple, reg_no):
    """
    Persist the raw source of a record whose parsing produced no clean data.

    Writes the raw source to the Cassandra table saic_by_name_source and
    records the company in the parse-error Mongo collection so it can be
    retried/inspected later.

    :param data_tuple: (raw_source, ...) — only raw_source (a dict) is used
    :param reg_no: registration number, used in the failure log message
    """
    raw_data = data_tuple[0]
    source = json.dumps(raw_data)
    company_name = raw_data.get("companyName", "")
    province = raw_data.get('province', '')
    if not company_name:
        # BUGFIX: previously formatted the (empty) company_name here, so the
        # log never identified the record; log the reg_no like
        # save_data_cassandra does.
        logger.info('reg_no:%s no name' % reg_no)
        return

    today = datetime.datetime.today().strftime('%Y-%m-%d')

    session.execute('INSERT INTO saic_by_name_source (company_name, date, province, source) '
                 'values (%s,%s,%s,%s)',
                 (company_name,today,province,source))

    logger.info("done"*100)
    company_data = {}
    company_data['_id']=company_name
    company_data['companyName'] = company_name
    company_data['province'] = province
    company_data['updateTime'] = datetime.datetime.utcnow()
    insert_into_mongo(COLLECTION_COMPANY_ERROR,{'_id':company_data['_id']},company_data)

    # save crawler log to cassandra
    log_cassandra(province)
    return


def save_data(data_tuple, reg_no, id_status):
    """
    Top-level persistence entry point for one crawl result.

    Routes the pieces of `data_tuple` to the right sinks, then records the
    reg_no's crawl status in Mongo.

    :param data_tuple: (raw_source, clean_data[, url_data])
    :param reg_no: registration number being crawled
    :param id_status: crawl status; 'value' means parsed data is present
    """
    # a 3-tuple carries a URL payload as its last item
    if len(data_tuple) == 3:
        save_url_mongo(data_tuple[2])

    # raw source present but no clean data -> record as a parse failure
    if data_tuple and not data_tuple[1]:
        save_source_parse_error(data_tuple, reg_no)

    if id_status == 'value':
        save_data_cassandra(data_tuple, reg_no)

    # always log the status for this reg_no, whatever happened above
    save_log_mongo({'_id': reg_no, 'status': id_status})
    return


def save_log_mongo(data):
    """Upsert a reg-no status record (keyed by _id) into saic_ids_log."""
    filter_doc = {'_id': data['_id']}
    insert_into_mongo(COLLECTION_ID, filter_doc, data)
    return


def save_url_mongo(data):
    """Upsert a company URL record (keyed by companyName) into saic_company_url."""
    filter_doc = {'companyName': data['companyName']}
    insert_into_mongo(COLLECTION_URL, filter_doc, data)
    return


def log_cassandra(province):
    """
    Increment the per-hour crawl counter for this province's crawler in the
    crawler_log counter table.

    :param province: province code; used to derive the crawler name
    """
    crawler_name = 'saic_%s.py' % province
    # take a single timestamp so date and hour cannot disagree if this runs
    # across an hour/midnight boundary (the old code called now() twice)
    now = datetime.datetime.now()
    date = now.strftime('%Y-%m-%d')
    hour = now.hour
    # bind parameters through the driver instead of interpolating values
    # into the CQL string (avoids quoting bugs / CQL injection)
    session.execute('UPDATE crawler_log SET count=count+1 '
                    'WHERE date=%s AND hour=%s AND crawler=%s',
                    (date, hour, crawler_name))
    return


@timeit
def kill_captcha(data,source,format):
    try:
        if not all([source, format, data]) or not source:
            raise Exception('no param')

        # 调用`验证码service`
        if source and source in CAP_SERVICE_PROVINCE:
            captcha = kill_captcha_from_service(data, source, format)
            return captcha

        abc = StringIO.StringIO(data)
        print source
        print source=='ln'
        captcha = apply(CAP_DICT.get(source),[abc,])

        #print 'returning.......', captcha
    except Exception ,e:
        captcha = 'wrong'
        err = traceback.format_exc(e)
        print "8"*100
        logger.exception(err)
    captcha = str(captcha)
    captcha = captcha.strip()
    return captcha

if __name__ == '__main__':
    # NOTE(review): placeholder entry point — json.loads('') raises
    # ValueError, so running this module directly fails unless data_str
    # is filled in with a test payload first. TODO: confirm intent.
    data_str = ''
    data = json.loads(data_str)
